{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "html = \"\"\"\n",
    "<html><head><title>The Dormouse's story</title></head>\n",
    "<body>\n",
    "<p class=\"title\" name=\"dromouse\"><b>The Dormouse's story</b></p>\n",
    "<p class=\"story\">Once upon a time there were three little sisters; and their names were\n",
    "<a href=\"http://example.com/elsie\" class=\"sister\" id=\"link1\"><!-- Elsie --></a>,\n",
    "<a href=\"http://example.com/lacie\" class=\"sister\" id=\"link2\">Lacie</a> and\n",
    "<a href=\"http://example.com/tillie\" class=\"sister\" id=\"link3\">Tillie</a>;\n",
    "and they lived at the bottom of a well.</p>\n",
    "<p class=\"story\">...</p>\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "F:\\Program\\Anaconda2\\lib\\site-packages\\bs4\\__init__.py:181: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"lxml\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n",
      "\n",
      "The code that caused this warning is on line 174 of the file F:\\Program\\Anaconda2\\lib\\runpy.py. To get rid of this warning, change code that looks like this:\n",
      "\n",
      " BeautifulSoup(YOUR_MARKUP})\n",
      "\n",
      "to this:\n",
      "\n",
      " BeautifulSoup(YOUR_MARKUP, \"lxml\")\n",
      "\n",
      "  markup_type=markup_type))\n"
     ]
    }
   ],
   "source": [
    "soup = BeautifulSoup(html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<html>\n",
      " <head>\n",
      "  <title>\n",
      "   The Dormouse's story\n",
      "  </title>\n",
      " </head>\n",
      " <body>\n",
      "  <p class=\"title\" name=\"dromouse\">\n",
      "   <b>\n",
      "    The Dormouse's story\n",
      "   </b>\n",
      "  </p>\n",
      "  <p class=\"story\">\n",
      "   Once upon a time there were three little sisters; and their names were\n",
      "   <a class=\"sister\" href=\"http://example.com/elsie\" id=\"link1\">\n",
      "    <!-- Elsie -->\n",
      "   </a>\n",
      "   ,\n",
      "   <a class=\"sister\" href=\"http://example.com/lacie\" id=\"link2\">\n",
      "    Lacie\n",
      "   </a>\n",
      "   and\n",
      "   <a class=\"sister\" href=\"http://example.com/tillie\" id=\"link3\">\n",
      "    Tillie\n",
      "   </a>\n",
      "   ;\n",
      "and they lived at the bottom of a well.\n",
      "  </p>\n",
      "  <p class=\"story\">\n",
      "   ...\n",
      "  </p>\n",
      " </body>\n",
      "</html>\n"
     ]
    }
   ],
   "source": [
    "print soup.prettify()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<title>The Dormouse's story</title>\n"
     ]
    }
   ],
   "source": [
    "# \n",
    "\n",
    "print soup.title"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 四大对象种类\n",
    "Beautiful Soup将复杂HTML文档转换成一个复杂的树形结构,每个节点都是Python对象,所有对象可以归纳为4种:\n",
    "\n",
    "> * Tag\n",
    "> * NavigableString\n",
    "> * BeautifulSoup\n",
    "> * Comment\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The Dormouse's story\n"
     ]
    }
   ],
   "source": [
    "print soup.title.string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The Dormouse's story\n"
     ]
    }
   ],
   "source": [
    "print soup.title.string.encode('utf8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<head><title>The Dormouse's story</title></head>\n"
     ]
    }
   ],
   "source": [
    "print soup.head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<a class=\"sister\" href=\"http://example.com/elsie\" id=\"link1\"><!-- Elsie --></a>\n"
     ]
    }
   ],
   "source": [
    "print soup.a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<p class=\"title\" name=\"dromouse\"><b>The Dormouse's story</b></p>\n"
     ]
    }
   ],
   "source": [
    "print soup.p"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'bs4.element.Tag'>\n"
     ]
    }
   ],
   "source": [
    "print type(soup.a)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[document]\n"
     ]
    }
   ],
   "source": [
    "print soup.name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "head\n"
     ]
    }
   ],
   "source": [
    "print soup.head.name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'class': ['title'], 'name': 'dromouse'}\n"
     ]
    }
   ],
   "source": [
    "print soup.p.attrs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['title']\n"
     ]
    }
   ],
   "source": [
    "print soup.p['class']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['title']\n"
     ]
    }
   ],
   "source": [
    "print soup.p.get('class')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<p class=\"newClass\" name=\"dromouse\"><b>The Dormouse's story</b></p>\n"
     ]
    }
   ],
   "source": [
    "soup.p['class'] = \"newClass\"\n",
    "print soup.p"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<type 'unicode'>\n"
     ]
    }
   ],
   "source": [
    "print type(soup.name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[document]\n"
     ]
    }
   ],
   "source": [
    "print soup.name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{}\n"
     ]
    }
   ],
   "source": [
    "print soup.attrs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<a class=\"sister\" href=\"http://example.com/elsie\" id=\"link1\"><!-- Elsie --></a>\n"
     ]
    }
   ],
   "source": [
    "print soup.a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " Elsie \n"
     ]
    }
   ],
   "source": [
    "print soup.a.string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'bs4.element.Comment'>\n"
     ]
    }
   ],
   "source": [
    "print type(soup.a.string)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " Elsie \n"
     ]
    }
   ],
   "source": [
    "import bs4\n",
    "if type(soup.a.string) == bs4.element.Comment:\n",
    "    print soup.a.string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[<title>The Dormouse's story</title>]\n"
     ]
    }
   ],
   "source": [
    "print soup.head.contents"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 遍历文档树"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[<title>The Dormouse's story</title>]\n"
     ]
    }
   ],
   "source": [
    "print soup.head.contents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<title>The Dormouse's story</title>\n"
     ]
    }
   ],
   "source": [
    "print soup.head.contents[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<listiterator object at 0x0000000006028630>\n"
     ]
    }
   ],
   "source": [
    "print soup.head.children"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "<p class=\"newClass\" name=\"dromouse\"><b>The Dormouse's story</b></p>\n",
      "\n",
      "\n",
      "<p class=\"story\">Once upon a time there were three little sisters; and their names were\n",
      "<a class=\"sister\" href=\"http://example.com/elsie\" id=\"link1\"><!-- Elsie --></a>,\n",
      "<a class=\"sister\" href=\"http://example.com/lacie\" id=\"link2\">Lacie</a> and\n",
      "<a class=\"sister\" href=\"http://example.com/tillie\" id=\"link3\">Tillie</a>;\n",
      "and they lived at the bottom of a well.</p>\n",
      "\n",
      "\n",
      "<p class=\"story\">...</p>\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for child in soup.body.children:\n",
    "    print child"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<html><head><title>The Dormouse's story</title></head>\n",
      "<body>\n",
      "<p class=\"newClass\" name=\"dromouse\"><b>The Dormouse's story</b></p>\n",
      "<p class=\"story\">Once upon a time there were three little sisters; and their names were\n",
      "<a class=\"sister\" href=\"http://example.com/elsie\" id=\"link1\"><!-- Elsie --></a>,\n",
      "<a class=\"sister\" href=\"http://example.com/lacie\" id=\"link2\">Lacie</a> and\n",
      "<a class=\"sister\" href=\"http://example.com/tillie\" id=\"link3\">Tillie</a>;\n",
      "and they lived at the bottom of a well.</p>\n",
      "<p class=\"story\">...</p>\n",
      "</body></html>\n",
      "<head><title>The Dormouse's story</title></head>\n",
      "<title>The Dormouse's story</title>\n",
      "The Dormouse's story\n",
      "\n",
      "\n",
      "<body>\n",
      "<p class=\"newClass\" name=\"dromouse\"><b>The Dormouse's story</b></p>\n",
      "<p class=\"story\">Once upon a time there were three little sisters; and their names were\n",
      "<a class=\"sister\" href=\"http://example.com/elsie\" id=\"link1\"><!-- Elsie --></a>,\n",
      "<a class=\"sister\" href=\"http://example.com/lacie\" id=\"link2\">Lacie</a> and\n",
      "<a class=\"sister\" href=\"http://example.com/tillie\" id=\"link3\">Tillie</a>;\n",
      "and they lived at the bottom of a well.</p>\n",
      "<p class=\"story\">...</p>\n",
      "</body>\n",
      "\n",
      "\n",
      "<p class=\"newClass\" name=\"dromouse\"><b>The Dormouse's story</b></p>\n",
      "<b>The Dormouse's story</b>\n",
      "The Dormouse's story\n",
      "\n",
      "\n",
      "<p class=\"story\">Once upon a time there were three little sisters; and their names were\n",
      "<a class=\"sister\" href=\"http://example.com/elsie\" id=\"link1\"><!-- Elsie --></a>,\n",
      "<a class=\"sister\" href=\"http://example.com/lacie\" id=\"link2\">Lacie</a> and\n",
      "<a class=\"sister\" href=\"http://example.com/tillie\" id=\"link3\">Tillie</a>;\n",
      "and they lived at the bottom of a well.</p>\n",
      "Once upon a time there were three little sisters; and their names were\n",
      "\n",
      "<a class=\"sister\" href=\"http://example.com/elsie\" id=\"link1\"><!-- Elsie --></a>\n",
      " Elsie \n",
      ",\n",
      "\n",
      "<a class=\"sister\" href=\"http://example.com/lacie\" id=\"link2\">Lacie</a>\n",
      "Lacie\n",
      " and\n",
      "\n",
      "<a class=\"sister\" href=\"http://example.com/tillie\" id=\"link3\">Tillie</a>\n",
      "Tillie\n",
      ";\n",
      "and they lived at the bottom of a well.\n",
      "\n",
      "\n",
      "<p class=\"story\">...</p>\n",
      "...\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for child in soup.descendants:\n",
    "    print child"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The Dormouse's story\n"
     ]
    }
   ],
   "source": [
    "print soup.head.string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The Dormouse's story\n"
     ]
    }
   ],
   "source": [
    "print soup.title.string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "u\"The Dormouse's story\"\n",
      "u\"The Dormouse's story\"\n",
      "u'Once upon a time there were three little sisters; and their names were'\n",
      "u','\n",
      "u'Lacie'\n",
      "u'and'\n",
      "u'Tillie'\n",
      "u';\\nand they lived at the bottom of a well.'\n",
      "u'...'\n"
     ]
    }
   ],
   "source": [
    "for string in soup.stripped_strings:\n",
    "    print(repr(string))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "body\n"
     ]
    }
   ],
   "source": [
    "p = soup.p\n",
    "print p.parent.name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "title\n"
     ]
    }
   ],
   "source": [
    "content = soup.head.title.string\n",
    "print content.parent.name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "title\n",
      "head\n",
      "html\n",
      "[document]\n"
     ]
    }
   ],
   "source": [
    "content = soup.head.title.string\n",
    "for parent in content.parents:\n",
    "    print parent.name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "u',\\n'\n",
      "<a class=\"sister\" href=\"http://example.com/lacie\" id=\"link2\">Lacie</a>\n",
      "u' and\\n'\n",
      "<a class=\"sister\" href=\"http://example.com/tillie\" id=\"link3\">Tillie</a>\n",
      "u';\\nand they lived at the bottom of a well.'\n"
     ]
    }
   ],
   "source": [
    "for sibling in soup.a.next_siblings:\n",
    "    print(repr(sibling))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<title>The Dormouse's story</title>\n"
     ]
    }
   ],
   "source": [
    "print soup.head.next_element"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'last_a_tag' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m\u001b[0m",
      "\u001b[1;31mNameError\u001b[0mTraceback (most recent call last)",
      "\u001b[1;32m<ipython-input-42-a3609203387c>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;31m# .next_elements 后所有 .previous_elements 前所有\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[1;32mfor\u001b[0m \u001b[0melement\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mlast_a_tag\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnext_elements\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      4\u001b[0m     \u001b[1;32mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrepr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0melement\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mNameError\u001b[0m: name 'last_a_tag' is not defined"
     ]
    }
   ],
   "source": [
    "# .next_elements 后所有 .previous_elements 前所有\n",
    "\n",
    "for element in last_a_tag.next_elements:\n",
    "    print(repr(element))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 搜索文档树"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<b>The Dormouse's story</b>]"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "soup.find_all('b')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[<a class=\"sister\" href=\"http://example.com/elsie\" id=\"link1\"><!-- Elsie --></a>, <a class=\"sister\" href=\"http://example.com/lacie\" id=\"link2\">Lacie</a>, <a class=\"sister\" href=\"http://example.com/tillie\" id=\"link3\">Tillie</a>]\n"
     ]
    }
   ],
   "source": [
    "print soup.find_all('a')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "body\n",
      "b\n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "for tag in soup.find_all(re.compile(\"^b\")):\n",
    "    print(tag.name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<b>The Dormouse's story</b>,\n",
       " <a class=\"sister\" href=\"http://example.com/elsie\" id=\"link1\"><!-- Elsie --></a>,\n",
       " <a class=\"sister\" href=\"http://example.com/lacie\" id=\"link2\">Lacie</a>,\n",
       " <a class=\"sister\" href=\"http://example.com/tillie\" id=\"link3\">Tillie</a>]"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "soup.find_all([\"a\", \"b\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "html\n",
      "head\n",
      "title\n",
      "body\n",
      "p\n",
      "b\n",
      "p\n",
      "a\n",
      "a\n",
      "a\n",
      "p\n"
     ]
    }
   ],
   "source": [
    "for tag in soup.find_all(True):\n",
    "    print(tag.name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "def has_class_but_no_id(tag):\n",
    "    return tag.has_attr('class') and not tag.has_attr('id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<p class=\"newClass\" name=\"dromouse\"><b>The Dormouse's story</b></p>,\n",
       " <p class=\"story\">Once upon a time there were three little sisters; and their names were\\n<a class=\"sister\" href=\"http://example.com/elsie\" id=\"link1\"><!-- Elsie --></a>,\\n<a class=\"sister\" href=\"http://example.com/lacie\" id=\"link2\">Lacie</a> and\\n<a class=\"sister\" href=\"http://example.com/tillie\" id=\"link3\">Tillie</a>;\\nand they lived at the bottom of a well.</p>,\n",
       " <p class=\"story\">...</p>]"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "soup.find_all(has_class_but_no_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<a class=\"sister\" href=\"http://example.com/lacie\" id=\"link2\">Lacie</a>]"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "soup.find_all(id='link2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<a class=\"sister\" href=\"http://example.com/elsie\" id=\"link1\"><!-- Elsie --></a>]"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "soup.find_all(href=re.compile(\"elsie\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<a class=\"sister\" href=\"http://example.com/elsie\" id=\"link1\"><!-- Elsie --></a>]"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "soup.find_all(href=re.compile(\"elsie\"),id='link1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<a class=\"sister\" href=\"http://example.com/elsie\" id=\"link1\"><!-- Elsie --></a>,\n",
       " <a class=\"sister\" href=\"http://example.com/lacie\" id=\"link2\">Lacie</a>,\n",
       " <a class=\"sister\" href=\"http://example.com/tillie\" id=\"link3\">Tillie</a>]"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "soup.find_all(\"a\", class_=\"sister\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_soup = BeautifulSoup('<div data-foo=\"value\">foo!</div>')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "ename": "SyntaxError",
     "evalue": "keyword can't be an expression (<ipython-input-59-34fc3cde6712>, line 1)",
     "output_type": "error",
     "traceback": [
      "\u001b[1;36m  File \u001b[1;32m\"<ipython-input-59-34fc3cde6712>\"\u001b[1;36m, line \u001b[1;32m1\u001b[0m\n\u001b[1;33m    data_soup.find_all(data-foo=\"value\")\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m keyword can't be an expression\n"
     ]
    }
   ],
   "source": [
    "data_soup.find_all(data-foo=\"value\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<div data-foo=\"value\">foo!</div>]"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_soup.find_all(attrs={\"data-foo\": \"value\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "soup.find_all(text=\"Elsie\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[u'Lacie', u'Tillie']"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "soup.find_all(text=[\"Tillie\", \"Elsie\", \"Lacie\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[u\"The Dormouse's story\", u\"The Dormouse's story\"]"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "soup.find_all(text=re.compile(\"Dormouse\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<a class=\"sister\" href=\"http://example.com/elsie\" id=\"link1\"><!-- Elsie --></a>,\n",
       " <a class=\"sister\" href=\"http://example.com/lacie\" id=\"link2\">Lacie</a>]"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "soup.find_all(\"a\", limit=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<title>The Dormouse's story</title>]"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "soup.html.find_all(\"title\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<title>The Dormouse's story</title>]"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "soup.find_all(\"title\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "soup.html.find_all(\"title\", recursive=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### CSS选择器"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[<title>The Dormouse's story</title>]\n"
     ]
    }
   ],
   "source": [
    "print soup.select('title')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[<a class=\"sister\" href=\"http://example.com/elsie\" id=\"link1\"><!-- Elsie --></a>, <a class=\"sister\" href=\"http://example.com/lacie\" id=\"link2\">Lacie</a>, <a class=\"sister\" href=\"http://example.com/tillie\" id=\"link3\">Tillie</a>]\n"
     ]
    }
   ],
   "source": [
    "print soup.select('a')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[<b>The Dormouse's story</b>]\n"
     ]
    }
   ],
   "source": [
    "print soup.select('b')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[<a class=\"sister\" href=\"http://example.com/elsie\" id=\"link1\"><!-- Elsie --></a>, <a class=\"sister\" href=\"http://example.com/lacie\" id=\"link2\">Lacie</a>, <a class=\"sister\" href=\"http://example.com/tillie\" id=\"link3\">Tillie</a>]\n"
     ]
    }
   ],
   "source": [
    "print soup.select('.sister')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[<a class=\"sister\" href=\"http://example.com/elsie\" id=\"link1\"><!-- Elsie --></a>]\n"
     ]
    }
   ],
   "source": [
    "print soup.select('#link1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[<a class=\"sister\" href=\"http://example.com/elsie\" id=\"link1\"><!-- Elsie --></a>]\n"
     ]
    }
   ],
   "source": [
    "print soup.select('p #link1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[<title>The Dormouse's story</title>]\n"
     ]
    }
   ],
   "source": [
    "print soup.select('head > title') #直接子标签查找"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[<a class=\"sister\" href=\"http://example.com/elsie\" id=\"link1\"><!-- Elsie --></a>, <a class=\"sister\" href=\"http://example.com/lacie\" id=\"link2\">Lacie</a>, <a class=\"sister\" href=\"http://example.com/tillie\" id=\"link3\">Tillie</a>]\n"
     ]
    }
   ],
   "source": [
    "print soup.select('a[class=\"sister\"]')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[<a class=\"sister\" href=\"http://example.com/elsie\" id=\"link1\"><!-- Elsie --></a>]\n"
     ]
    }
   ],
   "source": [
    "print soup.select('a[href=\"http://example.com/elsie\"]')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[<a class=\"sister\" href=\"http://example.com/elsie\" id=\"link1\"><!-- Elsie --></a>]\n"
     ]
    }
   ],
   "source": [
    "print soup.select('p a[href=\"http://example.com/elsie\"]')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<type 'list'>\n",
      "The Dormouse's story\n",
      "The Dormouse's story\n"
     ]
    }
   ],
   "source": [
    "soup = BeautifulSoup(html, 'lxml')\n",
    "print type(soup.select('title'))\n",
    "print soup.select('title')[0].get_text()\n",
    "\n",
    "for title in soup.select('title'):\n",
    "    print title.get_text()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
